* Preparation of dataset for IV analysis
clear all // Start from an empty memory state

*****************************************************************************
* ALL COMPANIES THAT BUILD OR BUY WITH INTERNAL HUMAN CAPITAL (HC)
*****************************************************************************
* Base file: every later dataset is merged onto this one
use "${input_stata}\marche_f_HC_t0_nbheur.dta", clear

* One-to-one merges on (sirtg, year, code).
* No keep() is given, so unmatched rows from either side are retained;
* nogenerate means no _merge indicator is left behind.
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t0_nombre", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t0_s_brut", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t1_nbheur", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t1_nombre", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t1_s_brut", nogenerate
merge 1:1 sirtg year code using "${input_stata}\HC_secteur", nogenerate
merge 1:1 sirtg year code using "${input_stata}\distance_within_nombre", nogenerate
merge 1:1 sirtg year code using "${input_stata}\distance_bvrs", nogenerate
merge 1:1 sirtg year code using "${input_stata}\inv_final.dta", nogenerate
merge 1:1 sirtg year code using "${input_stata}\HC_alternative.dta", nogenerate

* Many-to-one merge on (sirtg, year): values are shared across codes
merge m:1 sirtg year using "${input_stata}\HC_own_sector_t1_nombre", nogenerate

* Bartik files: only observations found on both sides survive
merge m:1 sirtg year code using "${input_stata}\HC_bartik_nombre", keep(match) nogenerate
merge m:1 sirtg year code using "${input_stata}\HC_bartik_nbheur", keep(match) nogenerate
merge m:1 sirtg year code using "${input_stata}\HC_bartik_s_brut", keep(match) nogenerate

* Merger waves: keep every master row, discard rows present only in the using file
merge m:1 sirtg year code using "${input_stata}\merger_waves", keep(master match) nogenerate

* Standardize the sector code: rename it, store it as a string, and left-pad
* with zeros until it is at least 3 characters long
rename code code_entry
tostring code_entry, replace
* Each pass prepends a single "0" to values still shorter than 3 characters,
* so two passes cover lengths 1 and 2
forvalues pass = 1/2 {
    replace code_entry = "0" + code_entry if length(code_entry) < 3
}

*****************************************************************************
* RESTRICTING TO COMPANIES IN THE MATCHED GROUP
*****************************************************************************
* NOTE(review): neither merge below filters on the match result, so no rows
* are dropped in this section; confirm the restriction is applied elsewhere.
merge 1:1 sirtg year code_entry using "${input_stata}\list_build.dta", nogenerate

* update: shared variables that are missing in memory are filled from list_buy
merge 1:1 sirtg year code_entry using "${input_stata}\list_buy.dta", update nogenerate

*****************************************************************************
* CONTROL VARIABLES (FIXED AT THE SIRTG LEVEL, TAKEN FROM THE FIRST SAMPLE YEAR)
*****************************************************************************
* t0 controls are keyed on sirtg only; t1 controls on (sirtg, year)
merge m:1 sirtg using "${input_stata}\ctrl_var_t0.dta", nogenerate
merge m:1 sirtg year using "${input_stata}\ctrl_var_t1.dta", nogenerate

* Control variables, built with the same formulas for the t0 and t1 inputs
foreach t in t0 t1 {
    gen vaj_eff_`t'     = vaj_`t' / (eff_3112_`t' * 1000)        // Value-added per employee
    gen diversity_`t'   = nb_pcs_`t' / eff_3112_`t'              // Diversity
    gen size_`t'        = log(eff_3112_`t')                      // Log of firm size
    gen immo_eff_`t'    = immocorp_`t' / (eff_3112_`t' * 1000)   // Tangible assets per employee
    gen sal_eff_`t'     = (sum_s_brut_`t' / eff_3112_`t') * 1000 // Wages per employee
    gen tresact_eff_`t' = tresact_`t' / (eff_3112_`t' * 1000)    // Cash per employee
}

* Group identifier for each (entry code, apgr_1, type_sales_1) cell
egen orig_dest_size = group(code_entry apgr_1 type_sales_1)

* Entry-type indicators; left missing when entree is empty
tabulate entree
gen buy         = (entree == "external")    if !missing(entree) // External entry
gen int_newfirm = (entree == "int_newfirm") if !missing(entree) // Internal entry (new firm)
gen int_oldfirm = (entree == "int_const")   if !missing(entree) // Internal entry (existing firm)

* Numeric copies of the string codes; force turns non-numeric values into missing
destring apgr_1,     generate(apgr_1_num)     force
destring code_entry, generate(code_entry_num) force

***************************
* LISTED COMPANIES
***************************
* Keep all rows already in memory; rows absent from listed_group get listed_gr = 0
merge m:1 sirtg year using "${output_stata}\listed_group.dta", keep(master match)
replace listed_gr = 0 if _merge == 1
drop _merge

*************************
* ENTRY AND EXIT DISTANCES
*************************

* Flags equal to 1 when apgr_1 and code_entry share their first 1 / first 2 characters
gen same1 = (substr(apgr_1, 1, 1) == substr(code_entry, 1, 1)) // Same 1-digit industry
gen same2 = (substr(apgr_1, 1, 2) == substr(code_entry, 1, 2)) // Same 2-digit industry

* Two-character prefixes of both codes, converted to numeric
* (used as coarser keys for the input-output lookups)
gen apgr_1_2 = substr(apgr_1, 1, 2)   // Extract 2-digit industry
gen code_2   = substr(code_entry, 1, 2) // Extract 2-digit entry code
destring apgr_1_2 code_2, replace force

**** UPSTREAM AND DOWNSTREAM RELATIONSHIPS ****
* Looks up the variable share from the input-output table for every
* origin/entry sector pair and stores it as link_upstream / link_downstream.
*   upstream:   origin code is matched on "to",   entry code on "from"
*   downstream: origin code is matched on "from", entry code on "to"
*
* For each direction four key combinations are tried, in this order:
*   1. full origin code      x full entry code
*   2. 2-digit origin prefix x full entry code
*   3. full origin code      x 2-digit entry prefix
*   4. 2-digit origin prefix x 2-digit entry prefix
* Passes 2-4 use merge, update, which only fills values still missing, so the
* first combination that matches determines the stored share.
*
* The table path is defined once. The original spelled it both with and
* without a separator after the input_r macro; the form with the separator
* is used everywhere so all eight lookups read the same file.
local io_table "${input_r}\sources\Input_Output\io_2017.dta"

foreach link in upstream downstream {

    * Key names the (origin, entry) pair is temporarily renamed to
    if "`link'" == "upstream" {
        local io_keys "to from"
    }
    else {
        local io_keys "from to"
    }

    local first_pass = 1
    foreach dest in code_entry_num code_2 {
        foreach orig in apgr_1_num apgr_1_2 {

            rename (`orig' `dest') (`io_keys')

            if `first_pass' {
                * First pass brings share into the dataset
                merge m:1 from to using "`io_table'"
                local first_pass = 0
            }
            else {
                * Later passes only fill share where it is still missing
                merge m:1 from to using "`io_table'", update
            }
            drop if _merge == 2   // discard table rows with no counterpart in memory
            drop _merge

            rename (`io_keys') (`orig' `dest')
        }
    }

    * Free the name share for the next direction
    rename share link_`link'
}

************************
* ENTRY INTO NEW GEOGRAPHIC ZONES
************************
* Left joins: rows found only in the using files are discarded

merge m:1 sirtg code_entry year using "${input_stata}\entry_geo_dep.dta", keep(master match) nogenerate

merge m:1 sirtg code_entry year using "${input_stata}\entry_geo_reg.dta", keep(master match) nogenerate

*************************
* CREATION OF NEW FIRMS & CROSS-SECTIONAL SECTOR DATA
*************************

merge m:1 code_entry year using "${input_stata}\creation.dta", keep(master match) nogenerate

* The sector cross-section is keyed on a variable named code, so a temporary
* numeric copy of code_entry is created for the merge and removed afterwards
destring code_entry, generate(code) force
merge m:1 code year using "${input_stata}\cross_section_secteur.dta", keep(master match) nogenerate
drop code

*************************
* VARIABLES
*************************
* (1) New firm creation: relative change of nombre with respect to l1_nombre
gen g_creation = (nombre - l1_nombre) / l1_nombre
summarize nombre g_creation, detail

* (2) Growth in sector sales: relative change of ca with respect to l1_ca
gen g_secteur_sales = (ca - l1_ca) / l1_ca
summarize ca g_secteur_sales, detail

* (3) Market competition (Herfindahl index)
summarize herfindahl, detail

* (4) Risk sector/Uncertainty (standard deviation of sector sales)
summarize sd_time_growth_agg_ca sd_growth_siren_ca

* Winsorize both growth rates at the 1st and 99th percentiles, in place
winsor2 g_secteur_sales g_creation, cut(1 99) replace

* Numeric firm identifier and entry sales relative to l1_cagr
destring sirtg, generate(sirtg_num) force
summarize sales l1_cagr, detail
gen share = sales / l1_cagr
summarize share, detail

* HC distance = 1 - HC overlap, for each measure and each reference period
foreach measure in _nbheur _s_brut _nombre {
    foreach t in t0 t1 {
        gen HC_distance`measure'_`t' = 1 - HC_overlap_`t'`measure'
    }
}

gen HC_distance_secteur = 1 - HC_secteur
summarize HC_distance_secteur*

* Replaces HC_distance_init by its complement to one
replace HC_distance_init = 1 - HC_distance_init

*************************
* LABEL VARIABLES
*************************
* Labels are LaTeX fragments.
* NOTE(review): several indicator labels use \mathbb{1} outside math
* delimiters, while the upstream/downstream labels wrap it in them; confirm
* the table post-processing handles both forms.

* Entry-mode indicators
label variable buy "\mathbb{1}(Buy)$\_{\textit{g,n,t}}$"
label variable int_newfirm "\mathbb{1}(Build new firm)$\_{\textit{g,n,t}}$"
label variable int_oldfirm "\mathbb{1}(Build old firm)$\_{\textit{g,n,t}}$"

* HC distance measures
label variable HC_distance_nombre_t1 "HC Distance$\_{\textit{g,n,t-1}}$"
label variable HC_distance_s_brut_t1 "HC Distance$\_{\textit{g,n,t-1}}$ (wages)"
label variable HC_distance_nbheur_t1 "HC Distance$\_{\textit{g,n,t-1}}$ (hours)"
label variable HC_distance_nombre_t0 "HC Distance$\_{\textit{g,n,t0}}$"
label variable HC_distance_nbheur_t0 "HC Distance$\_{\textit{g,n,t0}}$ (hours)"
label variable HC_distance_s_brut_t0 "HC Distance$\_{\textit{g,n,t0}}$ (wages)"
label variable HC_distance_secteur "HC Distance$\_{\textit{g,n,t-1}}^{Sector}$" // will be modified

* Controls built from the t0 inputs
label variable vaj_eff_t0 "Value added/N. Employees$\_{\textit{g,t0}}$"
label variable diversity_t0 "N. Occupations/N. Employees$\_{\textit{g,t0}}$"
label variable size_t0 "log(N. Employees)$\_{\textit{g,t0}}$"
label variable immo_eff_t0 "Tangible Assets/N. Employees$\_{\textit{g,t0}}$"
label variable sal_eff_t0 "Total wages/N. Employees$\_{\textit{g,t0}}$"
label variable tresact_eff_t0 "Cash/N. Employees$\_{\textit{g,t0}}$"

* Controls built from the t1 inputs
label variable vaj_eff_t1 "Value added/N. Employees$\_{\textit{g,t-1}}$"
label variable diversity_t1 "N. Occupations/N. Employees$\_{\textit{g,t-1}}$"
label variable size_t1 "log(N. Employees)$\_{\textit{g,t-1}}$"
label variable immo_eff_t1 "Tangible Assets/N. Employees$\_{\textit{g,t-1}}$"
label variable sal_eff_t1 "Total wages/N. Employees$\_{\textit{g,t-1}}$"
label variable tresact_eff_t1 "Cash/N. Employees$\_{\textit{g,t-1}}$"

* Entry sales: share is labelled first, then sales is divided by 1000 before
* receiving its own label
label variable share "Entry sales/Lagged total sales$\_{\textit{g,n,t}}$"
replace sales = sales / 1000
label variable sales "Entry sales (M\euro{})$\_{\textit{g,n,t}}$"

* Listing status and same-industry indicators
label variable listed_gr "\mathbb{1}(Public)$\_{\textit{g,t-1}}$"
label variable same1 "\mathbb{1}(Same 1-digit Industry)$\_{\textit{o,n}}$"
label variable same2 "\mathbb{1}(Same 2-digit Industry)$\_{\textit{o,n}}$"

* Threshold indicators for the input-output links.
* Stata treats a missing value as larger than any number, so an unqualified
* "x > c" evaluates to 1 when x is missing. The if-qualifier keeps the
* indicator missing for pairs with no share in the input-output table,
* instead of silently flagging them as linked.
* NOTE(review): if a pair absent from the table should count as "no link",
* recode these indicators to 0 where the link is missing.
gen upstream01 = link_upstream > 0.01 if !missing(link_upstream)
gen upstream05 = link_upstream > 0.05 if !missing(link_upstream)
gen upstream10 = link_upstream > 0.1  if !missing(link_upstream)
gen upstream20 = link_upstream > 0.2  if !missing(link_upstream)
label var link_upstream "Upstream link$\_{\textit{o,n}}$"
* In the threshold labels the dollar sign that directly precedes a digit is
* escaped with a backslash; unescaped, Stata reads it as a reference to a
* global macro named by that digit and drops both characters from the label.
label var upstream05 "$\mathbb{1}$(Upstream link $>\$5\%)$\_{\textit{o,n}}$"
label var upstream01 "$\mathbb{1}$(Upstream link $>\$1\%)$\_{\textit{o,n}}$"

gen downstream01 = link_downstream > 0.01 if !missing(link_downstream)
gen downstream05 = link_downstream > 0.05 if !missing(link_downstream)
gen downstream10 = link_downstream > 0.1  if !missing(link_downstream)
gen downstream20 = link_downstream > 0.2  if !missing(link_downstream)
label var link_downstream "Downstream link$\_{\textit{o,n}}$"
label var downstream05 "$\mathbb{1}$(Downstream link $>\$5\%)$\_{\textit{o,n}}$"
label var downstream01 "$\mathbb{1}$(Downstream link $>\$1\%)$\_{\textit{o,n}}$"

rename distance distance_bvrs
label var distance_bvrs "Product market distance$\_{\textit{g,n,t-1}}$"

label var entree_dep "\mathbb{1}(New department)$\_{\textit{g,t}}$"
label var entree_reg "\mathbb{1}(New region)$\_{\textit{g,t}}$"
label variable nombre "N. New firms$\_{\textit{n,t-1}}$"

* Rebuild the diversification indicator from nb_secteur; it stays missing
* when nb_secteur is missing (an unqualified comparison would yield 1)
drop diversifie
gen diversifie = nb_secteur > 1 if !missing(nb_secteur)
label variable diversifie "\mathbb{1}(Diversified)$\_{\textit{g,t-1}}$"
label variable nb_secteur "N. Sectors$\_{\textit{g,t-1}}$"

******************
* FIXED EFFECTS
******************
* Group identifiers used as fixed effects. Four of the five share the
* (entry code, apgr_1) pair, kept in one local so the key list is written once.
local dest_orig "code_entry apgr_1"

egen orig_size                = group(type_sales_1 apgr_1)
egen orig_dest                = group(`dest_orig')
egen orig_dest_year           = group(`dest_orig' year)
egen orig1_orig2_dest         = group(`dest_orig' apgr_2)
egen orig_labprod_growth_dest = group(`dest_orig' type_labprod type_growth)

* Write the final regression dataset, overwriting any existing file
save "${output_stata}\bartik_regression.dta", replace